The data contains features extracted from the silhouettes of vehicles viewed at different angles. Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars. The purpose is to classify a given silhouette as one of three types of vehicle (car, bus, van), using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
#Import all necessary modules and load the data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import recall_score,precision_score,confusion_matrix,classification_report
# Load the vehicle silhouette dataset.
df = pd.read_csv('vehicle-1.csv')
# First look at the data (bare expressions display only inside a notebook).
df.head(10)
df.tail(10)
df.dtypes
df.shape
df.info()
# Count missing values before imputation (isna and isnull are aliases).
print(df.isna().sum().sum())
print(df.isnull().sum().sum())
# BUG FIX: restrict the mean to numeric columns. Plain df.mean() raises a
# TypeError on modern pandas because of the non-numeric 'class' column.
df = df.fillna(df.mean(numeric_only=True))
Replacing null values with the mean of each numeric column.
# Confirm that no missing values remain after the imputation above.
print(df.isna().sum().sum())
print(df.isnull().sum().sum())
# Summary statistics of every numeric feature, transposed for readability.
df.describe().T
# Class balance of the target variable.
df['class'].value_counts()
There are 429 cars, 218 buses and 199 vans.
# Visualise the distribution of every attribute on one 8x3 grid of subplots.
# BUG FIX: the for-loop body lost its indentation when the notebook was
# flattened; the structure below restores the intended loop.
plt.figure(figsize=(30, 50))  # one tall canvas for all subplots
pos = 1  # subplot position within the 8x3 grid
for feature in df.columns:
    plt.subplot(8, 3, pos)
    if feature not in ['class']:
        # Continuous attribute: histogram with a KDE overlay.
        # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11;
        # sns.histplot(df[feature], kde=True) is the modern equivalent.
        sns.distplot(df[feature], kde=True)
    else:
        # Categorical target: bar chart of class frequencies.
        sns.countplot(df[feature], palette='jet_r')
    pos += 1
# Boxplots to inspect outliers in circularity, max.length_aspect_ratio,
# pr.axis_aspect_ratio, scaled_variance and distance_circularity.
sns.set(style="darkgrid")
# Lay out a 2x3 grid; only five plots are needed, so one axis is removed.
f, axes = plt.subplots(2, 3, figsize=(15, 12))
sns.despine(left=True)
panels = [
    ('circularity', 'r', (0, 0)),
    ('max.length_aspect_ratio', 'g', (0, 1)),
    ('pr.axis_aspect_ratio', 'b', (0, 2)),
    ('scaled_variance', 'r', (1, 0)),
    ('distance_circularity', 'g', (1, 1)),
]
for column, colour, (row, col) in panels:
    sns.boxplot(df[column], color=colour, ax=axes[row, col])
axes[1, 2].remove();  # drop the unused sixth axis
# Pairwise feature relationships, coloured by vehicle class.
sns.pairplot(df, hue='class', diag_kind='kde', height=3)
plt.show()

sns.set(style="white")
# Correlation matrix of the numeric features.
corr = df.corr()
# A large figure so every annotated cell stays legible.
f, ax = plt.subplots(figsize=(20, 18))
# Annotated heatmap; vmax/center chosen to emphasise the strongest correlations.
sns.heatmap(
    corr,
    annot=True,
    cmap='YlGnBu',
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
Observation:-
- Elongatedness has a high correlation with scatter_ratio (95%) and scaled_variance.1 (93%).
- We can drop the scatter_ratio and scaled_variance.1 variables, as they are highly correlated with elongatedness.
- The class variable has 3 categorical values, so we have to encode it numerically (label encoding is applied below) before feeding it to our model.
# Drop the two features that are highly correlated with elongatedness.
adf = df.drop(['scatter_ratio', 'scaled_variance.1'], axis=1)
adf.head(10)
# Encode the categorical class labels as integers (bus/car/van -> 0/1/2).
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
adf['class_label_encoded'] = encoder.fit_transform(adf['class'])
adf.head(10)
New label-encoded values of the 'class' variable: bus = 0, car = 1, van = 2.
# The raw string labels are redundant now that the encoded column exists.
adf = adf.drop(columns=['class'])
adf
from sklearn.model_selection import train_test_split
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score, confusion_matrix

# Separate the encoded target from the predictor features.
target = adf['class_label_encoded']
features = adf.drop(['class_label_encoded'], axis=1)

# BUG FIX: the original split the data twice (test_size=0.2/random_state=10,
# then test_size=0.30/random_state=20); the first split was dead code that was
# immediately overwritten, so only the 30% hold-out split is kept. The unused
# mlxtend SequentialFeatureSelector import has also been removed.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.30, random_state=20)

# Standardise: fit the scaler on the training data only, then apply to test.
sc = StandardScaler()
X_train_sd = sc.fit_transform(X_train)
X_test_sd = sc.transform(X_test)
from sklearn.svm import SVC
# Support Vector Machine with a linear kernel on the standardised features.
# NOTE: gamma has no effect for a linear kernel; only C matters here.
svc_model = SVC(C=0.1, kernel='linear', gamma=1)
svc_model.fit(X_train_sd, y_train)
prediction = svc_model.predict(X_test_sd)
# Accuracy on the training set, then on the held-out test set.
for data, labels in ((X_train_sd, y_train), (X_test_sd, y_test)):
    print(svc_model.score(data, labels))
# Building a Support Vector Machine on train data with kernel = 'rbf'
# NOTE(review): `%timeit pass` is an IPython magic that times a no-op `pass`
# statement, not the SVM fit below — the nanosecond "execution time" figures
# quoted in the final summary table come from this and are not meaningful
# benchmarks of the model.
%timeit pass
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train_sd, y_train)
# check the accuracy on the training set
# (the second score() call below reports accuracy on the held-out test set)
print("Train accuracy- ",svc_model.score(X_train_sd, y_train))
print("Test accuracy- ",svc_model.score(X_test_sd, y_test))
# Try two further kernels on the same standardised train/test split; for each,
# fit, predict, and report train then test accuracy (prints are identical to
# running the two kernels back to back).
for kern in ('poly', 'sigmoid'):
    svc_model = SVC(kernel=kern)
    svc_model.fit(X_train_sd, y_train)
    prediction = svc_model.predict(X_test_sd)
    print(svc_model.score(X_train_sd, y_train))
    print(svc_model.score(X_test_sd, y_test))
Observation:- SVM gives a train accuracy of 0.97 and a test accuracy of 0.95 (with kernel='rbf').
# --- PCA groundwork: eigendecomposition of the covariance matrix ---
cov_matrix = np.cov(X_train_sd.T)  # covariance of the standardised features
# BUG FIX: the original passed cov_matrix as a *second* print() argument,
# printing a literal "%s" followed by the matrix; use %-formatting as the
# eigenvector/eigenvalue prints below already do.
print('Covariance Matrix \n%s' % cov_matrix)
# Eigenvalues and eigenvectors of the covariance matrix.
e_vals, e_vecs = np.linalg.eig(cov_matrix)
print('Eigenvectors \n%s' % e_vecs)
print('\nEigenvalues \n%s' % e_vals)
# Cumulative variance explained by the components, largest first.
tot = sum(e_vals)
var_exp = [(i / tot) * 100 for i in sorted(e_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
# Scree plot: per-component explained variance (bars) and its running
# cumulative total (step line).
plt.figure(figsize=(10, 5))
components = range(1, e_vals.size + 1)
plt.bar(components, var_exp, alpha=0.5, align='center',
        label='Individual explained variance')
plt.step(components, cum_var_exp, where='mid',
         label='Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
# Pair each eigenvalue's magnitude with its corresponding eigenvector column.
eigen_pairs = [(np.abs(e_vals[i]), e_vecs[:, i]) for i in range(len(e_vals))]
# BUG FIX: sort on the eigenvalue only. The original plain sort(reverse=True)
# falls back to comparing the ndarray eigenvectors whenever two eigenvalues
# are equal, which raises "the truth value of an array ... is ambiguous".
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)
eigen_pairs[:17]
# (A broken, commented-out manual projection onto the top eigenvectors was
# removed here; the sklearn PCA used below performs the same reduction.)
We can retain about 95% of the explained variance using 7 eigenvectors.
from sklearn.model_selection import train_test_split
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score, confusion_matrix

# Split the data into train and test.
# NOTE(review): this split (test_size=0.2, random_state=10) differs from the
# 0.30/random_state=20 split used in the pre-PCA experiments, so the
# before/after comparison is not on identical folds — confirm intended.
target = adf['class_label_encoded']
features = adf.drop(['class_label_encoded'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=10)

# Scale the data using the standard scaler.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_sd = scaler.fit_transform(X_train)
# BUG FIX: the original fitted a *new* scaler on the test set (data leakage
# and an inconsistent transform); reuse the statistics learned on the train
# set and only transform the test set.
X_test_sd = scaler.transform(X_test)

# Project onto the first 7 principal components (~95% variance retained).
from sklearn.decomposition import PCA
pca = PCA(n_components=7)
X_train_pca = pca.fit_transform(X_train_sd)
X_test_pca = pca.transform(X_test_sd)
from sklearn.svm import SVC
# Linear-kernel SVM on the PCA-reduced features.
# NOTE: gamma has no effect for a linear kernel; only C matters here.
svc_model = SVC(C=0.1, kernel='linear', gamma=1)
svc_model.fit(X_train_pca, y_train)
prediction = svc_model.predict(X_test_pca)
# Accuracy on the training set, then on the held-out test set.
for data, labels in ((X_train_pca, y_train), (X_test_pca, y_test)):
    print(svc_model.score(data, labels))
# Building a Support Vector Machine on train data with kernel = 'rbf'
# NOTE(review): `%timeit pass` is an IPython magic that times a no-op `pass`
# statement, not the SVM fit below — the nanosecond "execution time" figures
# in the final summary table come from this and do not measure training cost.
%timeit pass
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train_pca, y_train)
# check the accuracy on the training set
# (the second score() call below reports accuracy on the held-out test set)
print("Train accuracy - ",svc_model.score(X_train_pca, y_train))
print("Test accuracy- ",svc_model.score(X_test_pca, y_test))
# Polynomial kernel on the PCA-reduced features: fit, then report train and
# test accuracy.
svc_model = SVC(kernel='poly')
svc_model.fit(X_train_pca, y_train)
print(svc_model.score(X_train_pca, y_train))
print(svc_model.score(X_test_pca, y_test))

# Sigmoid kernel on the same reduced features; also keep its predictions.
svc_model = SVC(kernel='sigmoid')
svc_model.fit(X_train_pca, y_train)
prediction = svc_model.predict(X_test_pca)
print(svc_model.score(X_train_pca, y_train))
print(svc_model.score(X_test_pca, y_test))
Observation:- When we reduce the dimensionality from 16 to 7 features using PCA, we get a train accuracy of 0.96 and a test accuracy of 0.89 (with kernel='rbf').
# Tabulated comparison of the SVM results before and after PCA.
# NOTE(review): the "Execution time" figures come from `%timeit pass` above,
# which times a no-op statement — they do not measure SVM training time.
scores = {' ': ['Before PCA', 'After PCA'],
          ' Train accuracy ': [0.97, 0.96],
          ' Test accuracy': [0.95, 0.89],
          ' Execution time': ['6.19 ns', '6.11 ns']
          }
# BUG FIX: the original reassigned `df`, clobbering the raw dataset loaded at
# the top of the notebook; use a dedicated name for the summary table instead.
summary_df = pd.DataFrame(scores)
print(summary_df)